In [43]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression  
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedShuffleSplit

from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, GridSearchCV
In [44]:
data=pd.read_csv(r'C:\Users\prati\OneDrive\Desktop\Pract_2\Churn_Modelling.csv')
data
# exited is the dependent variable and others are independent
Out[44]:
RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
0 1 15634602 Hargrave 619 France Female 42 2 0.00 1 1 1 101348.88 1
1 2 15647311 Hill 608 Spain Female 41 1 83807.86 1 0 1 112542.58 0
2 3 15619304 Onio 502 France Female 42 8 159660.80 3 1 0 113931.57 1
3 4 15701354 Boni 699 France Female 39 1 0.00 2 0 0 93826.63 0
4 5 15737888 Mitchell 850 Spain Female 43 2 125510.82 1 1 1 79084.10 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9995 9996 15606229 Obijiaku 771 France Male 39 5 0.00 2 1 0 96270.64 0
9996 9997 15569892 Johnstone 516 France Male 35 10 57369.61 1 1 1 101699.77 0
9997 9998 15584532 Liu 709 France Female 36 7 0.00 1 0 1 42085.58 1
9998 9999 15682355 Sabbatini 772 Germany Male 42 3 75075.31 2 1 0 92888.52 1
9999 10000 15628319 Walker 792 France Female 28 4 130142.79 1 1 0 38190.78 0

10000 rows × 14 columns

LabelEncoding¶

In [45]:
col = ['Geography', 'Gender']
lbe = preprocessing.LabelEncoder()
# This initializes a LabelEncoder object from the preprocessing module in scikit-learn.

for i in col:
    data[i] = lbe.fit_transform(data[i])
data
Out[45]:
RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
0 1 15634602 Hargrave 619 0 0 42 2 0.00 1 1 1 101348.88 1
1 2 15647311 Hill 608 2 0 41 1 83807.86 1 0 1 112542.58 0
2 3 15619304 Onio 502 0 0 42 8 159660.80 3 1 0 113931.57 1
3 4 15701354 Boni 699 0 0 39 1 0.00 2 0 0 93826.63 0
4 5 15737888 Mitchell 850 2 0 43 2 125510.82 1 1 1 79084.10 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9995 9996 15606229 Obijiaku 771 0 1 39 5 0.00 2 1 0 96270.64 0
9996 9997 15569892 Johnstone 516 0 1 35 10 57369.61 1 1 1 101699.77 0
9997 9998 15584532 Liu 709 0 0 36 7 0.00 1 0 1 42085.58 1
9998 9999 15682355 Sabbatini 772 1 1 42 3 75075.31 2 1 0 92888.52 1
9999 10000 15628319 Walker 792 0 0 28 4 130142.79 1 1 0 38190.78 0

10000 rows × 14 columns

0 is Female and 1 is Male

In exited 1 means customer left the bank and 0 means customer does not left the bank
and label encoding to convert the categorical data into numerical.

In [46]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  int32  
 5   Gender           10000 non-null  int32  
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int32(2), int64(9), object(1)
memory usage: 1015.8+ KB
In [47]:
data.shape
Out[47]:
(10000, 14)
In [48]:
print("Number of rows :",data.shape[0])
print("Number of columns :",data.shape[1])
Number of rows : 10000
Number of columns : 14
In [49]:
data.describe()    #statistical info of data
# standard deviation (std)
Out[49]:
RowNumber CustomerId CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
count 10000.00000 1.000000e+04 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.00000 10000.000000 10000.000000 10000.000000
mean 5000.50000 1.569094e+07 650.528800 0.746300 0.545700 38.921800 5.012800 76485.889288 1.530200 0.70550 0.515100 100090.239881 0.203700
std 2886.89568 7.193619e+04 96.653299 0.827529 0.497932 10.487806 2.892174 62397.405202 0.581654 0.45584 0.499797 57510.492818 0.402769
min 1.00000 1.556570e+07 350.000000 0.000000 0.000000 18.000000 0.000000 0.000000 1.000000 0.00000 0.000000 11.580000 0.000000
25% 2500.75000 1.562853e+07 584.000000 0.000000 0.000000 32.000000 3.000000 0.000000 1.000000 0.00000 0.000000 51002.110000 0.000000
50% 5000.50000 1.569074e+07 652.000000 0.000000 1.000000 37.000000 5.000000 97198.540000 1.000000 1.00000 1.000000 100193.915000 0.000000
75% 7500.25000 1.575323e+07 718.000000 1.000000 1.000000 44.000000 7.000000 127644.240000 2.000000 1.00000 1.000000 149388.247500 0.000000
max 10000.00000 1.581569e+07 850.000000 2.000000 1.000000 92.000000 10.000000 250898.090000 4.000000 1.00000 1.000000 199992.480000 1.000000
In [50]:
data.columns
Out[50]:
Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

Dropping of useless columns¶

In [51]:
data.drop(['RowNumber','Surname','CustomerId'],axis=1,inplace=True)
data.shape
Out[51]:
(10000, 11)
In [52]:
data.columns
Out[52]:
Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')
In [53]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  int32  
 2   Gender           10000 non-null  int32  
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int32(2), int64(7)
memory usage: 781.4 KB
In [54]:
# numerical columns


numerical_columns = data.select_dtypes(include=['int64', 'float64']).columns

print("Numerical columns:", numerical_columns)
Numerical columns: Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')
In [55]:
# categorical columns


data_types = data.dtypes

# Filter columns with 'object' or 'category' data type
categorical_columns = data_types[data_types == 'object'].index.tolist() 
# .index.tolist(): This extracts the index labels of the selected entries (i.e., the names of the columns)
# and converts them into a list.


# Print the list of categorical columns
print("Categorical columns:", categorical_columns)
Categorical columns: []

there is no categorical data because we have already converted them in numerical

In [56]:
data.isnull()
Out[56]:
CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
0 False False False False False False False False False False False
1 False False False False False False False False False False False
2 False False False False False False False False False False False
3 False False False False False False False False False False False
4 False False False False False False False False False False False
... ... ... ... ... ... ... ... ... ... ... ...
9995 False False False False False False False False False False False
9996 False False False False False False False False False False False
9997 False False False False False False False False False False False
9998 False False False False False False False False False False False
9999 False False False False False False False False False False False

10000 rows × 11 columns

In [57]:
data.isnull().sum()
Out[57]:
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

as we can see there is no missing values present

In [58]:
data.head(2)
Out[58]:
CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
0 619 0 0 42 2 0.00 1 1 1 101348.88 1
1 608 2 0 41 1 83807.86 1 0 1 112542.58 0
In [59]:
(data[data['Exited']==1].shape[0]/data.shape[0])*100
Out[59]:
20.369999999999997

only around 20% of the data is showing churn

In [60]:
churn=data['Exited'].value_counts().reset_index()
churn.head()

# 0 means the customer has not exited and 1 means customer has already exited
Out[60]:
index Exited
0 0 7963
1 1 2037
In [61]:
sns.barplot(x=churn['index'], y=churn['Exited'])


# imbalanced data set
Out[61]:
<AxesSubplot:xlabel='index', ylabel='Exited'>
In [62]:
churning=data[data['Exited']==1]
churning.head(2)
Out[62]:
CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
0 619 0 0 42 2 0.0 1 1 1 101348.88 1
2 502 0 0 42 8 159660.8 3 1 0 113931.57 1
In [63]:
def calculate_ratios(ax):
#     This line defines a function named 
# calculate_ratios that takes one argument, ax, which is assumed to be a Matplotlib Axes object.
    l=len(ax.patches)
#     calculates the total number of patches (bars) in the given Axes object ax and stores it in the variable l.
    
    cnt=0
# The variable cnt is later used to keep track of the number of iterations in the loop.

    
    without_churn=[]
    with_churn=[]
#     2 empty list

    for p in ax.patches:
        
        if cnt<l/2:      
            without_churn.append(p.get_height())
        else:
            with_churn.append(p.get_height())
        cnt+=1
    print("without_churn : ",without_churn)
    print("with_churn : ",with_churn)
    
    ratio=[]
    for i,j in zip(without_churn,with_churn):
        ratio.append(j/i)
    print("ratio : ",ratio)

Geography vs Exited¶

In [64]:
data['Geography'].unique()
Out[64]:
array([0, 2, 1])
In [65]:
ax=sns.countplot(x=data['Geography'],hue=data['Exited'])
plt.xticks(rotation=90)
Out[65]:
(array([0, 1, 2]), [Text(0, 0, '0'), Text(1, 0, '1'), Text(2, 0, '2')])

0 is France 1 is Germany and 2 is Spain

from the above graph, we can see that the most regular customers are from France which are least likely to churn

In [66]:
calculate_ratios(ax)
without_churn :  [4204, 1695, 2064]
with_churn :  [810, 814, 413]
ratio :  [0.19267364414843008, 0.48023598820059, 0.2000968992248062]

so, far from the above ratio, we have seen that Germany has the maximum ratio. The customers who churn mostly are from Germany the germany and france and spain

Gender vs Exited¶

In [67]:
plt.figure(figsize=(6,4))
g=sns.countplot(x=data['Gender'],hue=data['Exited'])
In [68]:
calculate_ratios(g)
without_churn :  [3404, 4559]
with_churn :  [1139, 898]
ratio :  [0.33460634547591067, 0.19697302039921036]
In [69]:
(churning[churning['Gender']==0].shape[0]/data[data['Gender']==0].shape[0])*100
# churn rate percentage of females
Out[69]:
25.071538630860662

Female customers are more likely to churn. On the average almost 25% female customer churn.

Tenure vs Exited¶

In [109]:
plt.figure(figsize=(6,4))
sns.kdeplot(x=data['Tenure'],hue=data['Exited'],multiple='stack')
# kernel density estimation plot
Out[109]:
<AxesSubplot:xlabel='Tenure', ylabel='Density'>

above graph shows the relationship between tenure and churning. The lesser the tenure, more the chances to churn. so one of the ways to reduce customer churning would be to retain the customer for longer tenure, so it reduces the chance of churning.

In [71]:
dev=sns.countplot(x=data['Tenure'],hue=data['Exited'])
In [72]:
calculate_ratios(dev)
without_churn :  [318, 803, 847, 796, 786, 803, 771, 851, 828, 771, 389]
with_churn :  [95, 232, 201, 213, 203, 209, 196, 177, 197, 213, 101]
ratio :  [0.29874213836477986, 0.2889165628891656, 0.23730814639905548, 0.2675879396984925, 0.2582697201017812, 0.2602739726027397, 0.25421530479896237, 0.20799059929494712, 0.23792270531400966, 0.27626459143968873, 0.2596401028277635]

customers with 1 year tenure are more likely to churn

HasCrCard vs Exited¶

In [73]:
status=churning['HasCrCard'].value_counts().reset_index()
In [74]:
status.head()
Out[74]:
index HasCrCard
0 1 1424
1 0 613
In [75]:
Cr=sns.countplot(x=data['HasCrCard'],hue=data['Exited'])
In [76]:
calculate_ratios(Cr)
without_churn :  [2332, 5631]
with_churn :  [613, 1424]
ratio :  [0.26286449399656947, 0.25288581069081867]

the customer who does not have credit cards are more likely to churn

IsActiveMember vs Exited¶

In [77]:
status=sns.countplot(x=data['IsActiveMember'],hue=data['Exited'])
In [78]:
calculate_ratios(status)
without_churn :  [3547, 4416]
with_churn :  [1302, 735]
ratio :  [0.3670707640259374, 0.16644021739130435]

The customers who are not an active members are more likely to churn

NumOfProducts vs IsActiveMember wrt Exited¶

In [79]:
data['NumOfProducts'].unique()
Out[79]:
array([1, 3, 2, 4], dtype=int64)
In [80]:
productpref=sns.barplot(x=data['NumOfProducts'],y=data['IsActiveMember'],hue=data['Exited'])
plt.xticks(rotation=90)
Out[80]:
(array([0, 1, 2, 3]),
 [Text(0, 0, '1'), Text(1, 0, '2'), Text(2, 0, '3'), Text(3, 0, '4')])
In [81]:
calculate_ratios(productpref)
without_churn :  [0.5654421768707483, 0.5445544554455446, 0.6086956521739131, nan]
with_churn :  [0.34421575585521647, 0.39080459770114945, 0.38636363636363635, 0.48333333333333334]
ratio :  [0.6087550061443313, 0.7176593521421107, 0.6347402597402597, nan]
In [82]:
sns.lineplot(x=data['NumOfProducts'],y=data['IsActiveMember'],hue=data['Exited'])
# shadow around the lines represents the confidence interval (CI) of the estimated mean values
# The shade typically indicates the degree of uncertainty associated with the mean estimate at each point along the x-axis.
Out[82]:
<AxesSubplot:xlabel='NumOfProducts', ylabel='IsActiveMember'>

from the above graph, we can see the most of the customers that churn are having more products. This shows that the organisation or the company should focus on long term relations with customer to provide them consistent quality products.

CreditScore vs Exited¶

In [83]:
plt.figure(figsize=(16,10))
sns.histplot(x=data['CreditScore'],hue=data['Exited'],bins=100)

# bidding concept for credit score 
# because the value is very close to each other and conjested looking graph and does not convey any thing 
# bidding is based on frequency and count so when you group them this will clear every thing 
Out[83]:
<AxesSubplot:xlabel='CreditScore', ylabel='Count'>

the customer exited(1) is very low which is either they have low credit score because of their past transactions they would pass to repayment of the lower capacity is very less so they have churned the most and even the customers who have a good credit score they are less likely to charge

Gender vs IsActiveMember¶

In [84]:
data.groupby('Gender')['IsActiveMember'].value_counts()
Out[84]:
Gender  IsActiveMember
0       1                 2284
        0                 2259
1       1                 2867
        0                 2590
Name: IsActiveMember, dtype: int64
In [85]:
sns.barplot(x=data['Gender'],y=data['IsActiveMember'], hue=data['Exited'])

# line on the top is error bars indicates the uncertainty or variability in the data 
Out[85]:
<AxesSubplot:xlabel='Gender', ylabel='IsActiveMember'>

nearly both men and women are active members, not so much gender bias

In [86]:
data.columns
Out[86]:
Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')

correlation = data['CustomerId'].corr(data['Exited']) print("Correlation between CustomerID and Exited:", correlation)

the value is -0.0006.........something thats why we drop it in the begining

In [87]:
data.dtypes
Out[87]:
CreditScore          int64
Geography            int32
Gender               int32
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

checking the inconsistent data values in each columns

In [88]:
data.head(5)
Out[88]:
CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
0 619 0 0 42 2 0.00 1 1 1 101348.88 1
1 608 2 0 41 1 83807.86 1 0 1 112542.58 0
2 502 0 0 42 8 159660.80 3 1 0 113931.57 1
3 699 0 0 39 1 0.00 2 0 0 93826.63 0
4 850 2 0 43 2 125510.82 1 1 1 79084.10 0
In [89]:
# Isactivemember is grouped according to the hascrcard variables and the dependent variable is examined.
data.groupby(["IsActiveMember", "HasCrCard"]).agg({"Exited":"count"})
Out[89]:
Exited
IsActiveMember HasCrCard
0 0 1401
1 3448
1 0 1544
1 3607

checking the number of people who are active or not and has credit card or not and yet has exited or not

In [90]:
# Isactivemember is grouped according to hascrcard variables and the balance variable is examined.
data.groupby(["IsActiveMember", "HasCrCard"]).agg({"Balance" : "mean"})
Out[90]:
Balance
IsActiveMember HasCrCard
0 0 77825.424525
1 76853.588646
1 0 78007.318381
1 74962.849983

people who are active member and has cresdit card with balance

In [91]:
# The balance variable was examined according to the gender variable.
data.groupby("Gender").agg({"Balance": "mean"})
Out[91]:
Balance
Gender
0 75659.369139
1 77173.974506

Male and female and there balance

In [92]:
# How many people whose balance is 0 and do not leave?
data[(data["Balance"] == 0) & (data["Exited"] == 0)]
Out[92]:
CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
3 699 0 0 39 1 0.0 2 0 0 93826.63 0
6 822 0 1 50 7 0.0 2 1 1 10062.80 0
11 497 2 1 24 3 0.0 2 1 0 76390.01 0
12 476 0 0 34 10 0.0 2 1 0 26260.98 0
13 549 0 0 25 5 0.0 2 0 0 190857.79 0
... ... ... ... ... ... ... ... ... ... ... ...
9988 775 0 1 30 4 0.0 2 1 0 49337.84 0
9989 841 2 1 28 4 0.0 2 1 1 179436.60 0
9992 726 2 1 36 2 0.0 1 1 0 195192.40 0
9994 800 0 0 29 2 0.0 2 0 0 167773.55 0
9995 771 0 1 39 5 0.0 2 1 0 96270.64 0

3117 rows × 11 columns

how many people whose balance is 0 yet has not churn

In [93]:
# How many people whose balance is 0 leave?
data[(data["Balance"] == 0) & (data["Exited"] == 1)]
Out[93]:
CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
0 619 0 0 42 2 0.0 1 1 1 101348.88 1
22 510 2 0 38 4 0.0 1 1 0 118913.53 1
30 591 2 0 39 3 0.0 3 1 0 140469.38 1
58 511 2 0 66 4 0.0 1 1 0 1643.11 1
81 777 0 0 32 2 0.0 1 1 0 136458.19 1
... ... ... ... ... ... ... ... ... ... ... ...
9784 527 0 1 39 4 0.0 2 1 0 167183.07 1
9884 751 0 0 48 4 0.0 1 0 1 30165.06 1
9898 589 0 1 38 4 0.0 1 1 0 95483.48 1
9962 702 2 1 44 9 0.0 1 0 0 59207.41 1
9997 709 0 0 36 7 0.0 1 0 1 42085.58 1

500 rows × 11 columns

people with 0 balance who has churn

Correlation¶

In [94]:
# Access to the correlation of the data set was provided. What kind of relationship is examined between the variables. 
# If the correlation value is> 0, there is a positive correlation. While the value of one variable increases, the value of the other variable also increases.
# Correlation = 0 means no correlation.
# If the correlation is <0, there is a negative correlation. While one variable increases, the other variable decreases. 
# When the correlations are examined, there are 1 variables that act as a positive correlation to the exited dependent variable.
# This variable is Age. As this increases, the Result variable increases.
data.corr()
Out[94]:
CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
CreditScore 1.000000 0.007888 -0.002857 -0.003965 0.000842 0.006268 0.012238 -0.005458 0.025651 -0.001384 -0.027094
Geography 0.007888 1.000000 0.004719 0.022812 0.003739 0.069408 0.003972 -0.008523 0.006724 -0.001369 0.035943
Gender -0.002857 0.004719 1.000000 -0.027544 0.014733 0.012087 -0.021859 0.005766 0.022544 -0.008112 -0.106512
Age -0.003965 0.022812 -0.027544 1.000000 -0.009997 0.028308 -0.030680 -0.011721 0.085472 -0.007201 0.285323
Tenure 0.000842 0.003739 0.014733 -0.009997 1.000000 -0.012254 0.013444 0.022583 -0.028362 0.007784 -0.014001
Balance 0.006268 0.069408 0.012087 0.028308 -0.012254 1.000000 -0.304180 -0.014858 -0.010084 0.012797 0.118533
NumOfProducts 0.012238 0.003972 -0.021859 -0.030680 0.013444 -0.304180 1.000000 0.003183 0.009612 0.014204 -0.047820
HasCrCard -0.005458 -0.008523 0.005766 -0.011721 0.022583 -0.014858 0.003183 1.000000 -0.011866 -0.009933 -0.007138
IsActiveMember 0.025651 0.006724 0.022544 0.085472 -0.028362 -0.010084 0.009612 -0.011866 1.000000 -0.011421 -0.156128
EstimatedSalary -0.001384 -0.001369 -0.008112 -0.007201 0.007784 0.012797 0.014204 -0.009933 -0.011421 1.000000 0.012097
Exited -0.027094 0.035943 -0.106512 0.285323 -0.014001 0.118533 -0.047820 -0.007138 -0.156128 0.012097 1.000000
In [95]:
# Correlation Matrix
f, ax = plt.subplots(figsize= [12,8])
sns.heatmap(data.corr(), annot=True, fmt=".2f", ax=ax, cmap = "magma" )
ax.set_title("Correlation Matrix", fontsize=20)
plt.show()

Distribution of Exited¶

In [96]:
# The distribution of the dependent variable in the dataset is plotted as pie and columns graphs.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
data['Exited'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Distribution')
ax[0].set_ylabel('')
sns.countplot(x='Exited', data=data, ax=ax[1])
ax[1].set_title('Exited')
plt.show()
In [97]:
# Plotted the categorical variables on the basis of the graph of the column according to the dependent variable.
fig, axarr = plt.subplots(2, 2, figsize=(20, 12))
sns.countplot(x='Geography', hue = 'Exited',data = data, ax=axarr[0][0])
sns.countplot(x='Gender', hue = 'Exited',data = data, ax=axarr[0][1])
sns.countplot(x='HasCrCard', hue = 'Exited',data = data, ax=axarr[1][0])
sns.countplot(x='IsActiveMember', hue = 'Exited',data = data, ax=axarr[1][1])
Out[97]:
<AxesSubplot:xlabel='IsActiveMember', ylabel='count'>

Exited vs Age vs Geography¶

In [41]:
# Dependent variable was plotted according to age and geography variable.
import plotly.express as px
fig = px.bar(data,y = "Exited", x = "Age" , color = "Geography")
fig.show()
In [42]:
import plotly.express as px

labels = ['Female-Not Exited', 'Female-Exited', 'Male-Not Exited', 'Male-Exited']
Gender = data.groupby('Gender')['Exited'].value_counts().reset_index(name='count')
fig = px.pie(Gender, values='count', names=labels, title='Exited by Gender', hole=0.5)
fig.show()

Outliers detection¶

In [98]:
# Boxplot graph for outlier observation analysis
fig, axarr = plt.subplots(3, 2, figsize=(20, 12))
sns.boxplot(y='CreditScore',x = 'Exited', hue = 'Exited',data = data, ax=axarr[0][0])
sns.boxplot(y='Age',x = 'Exited', hue = 'Exited',data = data , ax=axarr[0][1])
sns.boxplot(y='Tenure',x = 'Exited', hue = 'Exited',data = data, ax=axarr[1][0])
sns.boxplot(y='Balance',x = 'Exited', hue = 'Exited',data = data, ax=axarr[1][1])
sns.boxplot(y='NumOfProducts',x = 'Exited', hue = 'Exited',data = data, ax=axarr[2][0])
sns.boxplot(y='EstimatedSalary',x = 'Exited', hue = 'Exited',data = data, ax=axarr[2][1])
Out[98]:
<AxesSubplot:xlabel='Exited', ylabel='EstimatedSalary'>
In [110]:
# Outlier Observation Analysis
for feature in data[['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited']]:
    
    Q1 = data[feature].quantile(0.25)
    Q3 = data[feature].quantile(0.75)
    IQR = Q3-Q1
    lower = Q1- 1.5*IQR
    upper = Q3 + 1.5*IQR
    
    if data[(data[feature] > upper)].any(axis=None):
        print(feature,"yes")
    else:
        print(feature, "no")
CreditScore no
Geography no
Gender no
Age yes
Tenure no
Balance no
NumOfProducts yes
HasCrCard no
IsActiveMember no
EstimatedSalary no
Exited yes

Model Building¶

In [101]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=10000),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}
x, y = data.drop('Exited', axis=1), data['Exited']

Train Test Splitting¶

In [102]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1 indicates you want only one split.
# test_size=0.2 specifies that 20% of the data will be used for testing.
# random_state=42 sets the random seed for reproducibility.

for train_index, test_index in sss.split(x, y):
    x_train, x_test = x.loc[train_index], x.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
# The .loc accessor in pandas allows us to select data based on labels of rows and/or columns.
In [103]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape
Out[103]:
((8000, 10), (2000, 10), (8000,), (2000,))
In [104]:
x_train.head()
Out[104]:
CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary
2151 753 0 1 57 7 0.00 1 1 0 159475.08
8392 739 1 1 32 3 102128.27 1 1 0 63981.37
5006 755 1 0 37 0 113865.23 2 1 1 117396.25
4117 561 0 1 37 5 0.00 2 1 0 83093.25
7182 692 1 1 49 6 110540.43 2 0 1 107472.99

Model Training¶

In [105]:
for name, model in models.items():
    model.fit(x_train, y_train)
#     model.fit(x_train, y_train): This fits the model to the training data (x_train, y_train), where x_train contains the features and y_train contains the target variable.
    
    y_pred = model.predict(x_test)
#     make predictions on the test data (x_test), producing predicted values for the target variable.
    
    accuracy = accuracy_score(y_test, y_pred)
#     comparing y_test and y_pred for accuracy
    
    print(f'{name} - Training Accuracy: {accuracy * 100}%')
# print(f'{name} - Training Accuracy: {accuracy * 100}%'): This prints out the name of the model along 
# with its training accuracy, where the accuracy is multiplied by 100 to convert it to a percentage.
Logistic Regression - Training Accuracy: 79.65%
Support Vector Machine - Training Accuracy: 79.65%
Random Forest - Training Accuracy: 86.15%
Decision Tree - Training Accuracy: 78.60000000000001%
K-Nearest Neighbors - Training Accuracy: 76.4%

Classification Report¶

In [107]:
# for name, model in models.items():
#     print(f'{name} - classification_report: \n{classification_report(y_test, model.predict(x_test))}')

from sklearn.metrics import classification_report

for name, model in models.items():
    # Fit the model to the training data
    model.fit(x_train, y_train)
    
    # Generate predictions on the test set
    y_pred = model.predict(x_test)
    
    # Print the classification report for the model
    print(f'{name} - classification_report: \n{classification_report(y_test, y_pred, zero_division=1)}')
    print()
Logistic Regression - classification_report: 
              precision    recall  f1-score   support

           0       0.80      1.00      0.89      1593
           1       1.00      0.00      0.00       407

    accuracy                           0.80      2000
   macro avg       0.90      0.50      0.44      2000
weighted avg       0.84      0.80      0.71      2000


Support Vector Machine - classification_report: 
              precision    recall  f1-score   support

           0       0.80      1.00      0.89      1593
           1       1.00      0.00      0.00       407

    accuracy                           0.80      2000
   macro avg       0.90      0.50      0.44      2000
weighted avg       0.84      0.80      0.71      2000


Random Forest - classification_report: 
              precision    recall  f1-score   support

           0       0.87      0.96      0.92      1593
           1       0.77      0.46      0.58       407

    accuracy                           0.86      2000
   macro avg       0.82      0.71      0.75      2000
weighted avg       0.85      0.86      0.85      2000


Decision Tree - classification_report: 
              precision    recall  f1-score   support

           0       0.87      0.86      0.86      1593
           1       0.47      0.48      0.47       407

    accuracy                           0.78      2000
   macro avg       0.67      0.67      0.67      2000
weighted avg       0.79      0.78      0.78      2000


K-Nearest Neighbors - classification_report: 
              precision    recall  f1-score   support

           0       0.80      0.94      0.86      1593
           1       0.26      0.08      0.13       407

    accuracy                           0.76      2000
   macro avg       0.53      0.51      0.49      2000
weighted avg       0.69      0.76      0.71      2000


Kfold Cross-Validation¶

In [108]:
for name, model in models.items():
    kfold = KFold(n_splits=5, random_state=42, shuffle=True)
    print(f'{name} - Cross Validation Score: {round(cross_val_score(model, x_test, y_test, cv=kfold).mean() * 100,3)}%')
# cross_val_score function automatically handles the splitting of the data into training and
# testing sets for each fold of cross-validation.
Logistic Regression - Cross Validation Score: 78.7%
Support Vector Machine - Cross Validation Score: 79.65%
Random Forest - Cross Validation Score: 85.7%
Decision Tree - Cross Validation Score: 79.75%
K-Nearest Neighbors - Cross Validation Score: 76.4%
In [ ]: